In [1]:
import keras
In [2]:
keras.__version__
Out[2]:
In [3]:
from keras.layers import Embedding
# The maximum number of tokens is equal to the maximum word index + 1
max_number_of_tokens = 1000
embedding_dimensionality = 64
embedding_layer = Embedding(max_number_of_tokens, embedding_dimensionality)
The layer transforms a 2D input tensor of integers with shape (number_of_samples, sequence_length) into a 3D floating-point tensor with shape (number_of_samples, sequence_length, embedding_dimensionality). Such a tensor can then be processed by an RNN layer or a 1D convolutional layer.
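To see that shape transformation in action, a minimal sketch along the following lines can be used (the batch size of 32 and the sequence length of 10 are arbitrary illustration values, not part of the notebook above):
import numpy as np
from keras.models import Sequential
from keras.layers import Embedding

demo_model = Sequential()
demo_model.add(Embedding(input_dim = max_number_of_tokens,
                         output_dim = embedding_dimensionality,
                         input_length = 10))
# 32 samples, each a sequence of 10 integer word indices
dummy_batch = np.random.randint(0, max_number_of_tokens, size = (32, 10))
print(demo_model.predict(dummy_batch).shape)  # expected: (32, 10, 64)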
In [4]:
from keras.datasets import imdb
from keras.preprocessing.sequence import pad_sequences
In [5]:
# Number of words considered as features
max_features = 10000
In [6]:
# Cutting the reviews off after only 20 words
sequence_max_length = 20
In [7]:
# Loading data
(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words = max_features)
In [8]:
x_train.shape
Out[8]:
In [9]:
x_train_sequence = pad_sequences(x_train, maxlen = sequence_max_length)
In [10]:
x_train_sequence.shape
Out[10]:
In [11]:
x_train[0:2]
Out[11]:
In [12]:
# pad_sequences keeps the last 20 tokens by default, so this element matches x_train_sequence[0, 0]
x_train[0][-20]
Out[12]:
In [13]:
x_train_sequence[0, :]
Out[13]:
In [14]:
x_train_sequence[0]
Out[14]:
In [15]:
x_train_sequence[1]
Out[15]:
In [16]:
from keras.models import Sequential
from keras.layers import Flatten, Dense
In [17]:
model = Sequential()
model.add(Embedding(input_dim = max_features, output_dim = 8, input_length = sequence_max_length))
model.add(Flatten())
model.add(Dense(units = 1, activation = 'sigmoid'))
In [18]:
# Compiling the model
model.compile(optimizer = 'rmsprop',
              loss = 'binary_crossentropy',
              metrics = ['acc'])
In [19]:
model.summary()
In [20]:
# Training
history = model.fit(x = x_train_sequence,
                    y = y_train,
                    epochs = 10,
                    batch_size = 32,
                    validation_split = 0.2)
The raw IMDB dataset can be downloaded from: http://mng.bz/0tIo
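If the raw data is not already on disk, a download-and-extract sketch along these lines could be used (this assumes the short link serves the aclImdb_v1.tar.gz archive directly; the target directory mirrors the imdb_dir used below):
import os
import tarfile
import urllib.request

data_dir = './data/Chapter 6.1.2 - Using word embeddings/'
archive_path = os.path.join(data_dir, 'aclImdb_v1.tar.gz')
os.makedirs(data_dir, exist_ok = True)
if not os.path.exists(archive_path):
    # Assumption: the short link redirects to the .tar.gz archive itself
    urllib.request.urlretrieve('http://mng.bz/0tIo', archive_path)
with tarfile.open(archive_path, 'r:gz') as archive:
    # Creates the aclImdb/ folder used as imdb_dir below
    archive.extractall(data_dir)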
In [21]:
import os
In [22]:
imdb_dir = './data/Chapter 6.1.2 - Using word embeddings/aclImdb/'
In [23]:
train_dir = os.path.join(imdb_dir, 'train')
In [24]:
labels = []
texts = []
for label_type in ['neg', 'pos']:
    dir_name = os.path.join(train_dir, label_type)
    for fname in os.listdir(dir_name):
        # Only taking .txt files into consideration
        if fname[-4:] == '.txt':
            f = open(os.path.join(dir_name, fname), encoding="utf8")
            texts.append(f.read())
            f.close()
            if label_type == 'neg':
                labels.append(0)
            else:
                labels.append(1)
In [25]:
len(labels)
Out[25]:
In [26]:
len(texts)
Out[26]:
In [27]:
texts[0]
Out[27]:
In [28]:
labels[0]
Out[28]:
In [29]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import numpy as np
In [30]:
# Using only the first 100 words of each review
maxlen = 100
In [31]:
# Number of training samples
training_samples = 200
In [32]:
# Number of validation samples
validation_samples = 10000
In [33]:
# Tokenizing only the top 10,000 words in the dataset
max_words = 10000
In [34]:
# Initializing Tokenizer
tokenizer = Tokenizer(num_words = max_words)
In [35]:
# Fitting the Tokenizer on the text
tokenizer.fit_on_texts(texts)
In [36]:
# Converting the texts to sequences of word indices
sequences = tokenizer.texts_to_sequences(texts)
In [37]:
sequences[0:2]
Out[37]:
In [38]:
# Word index
word_index = tokenizer.word_index
In [39]:
type(word_index)
Out[39]:
In [40]:
first10pairs = {k: word_index[k] for k in list(word_index)[:10]}
In [41]:
first10pairs
Out[41]:
In [42]:
# Padding the sequences so they all have length maxlen
data = pad_sequences(sequences, maxlen = maxlen)
In [43]:
data.shape
Out[43]:
In [44]:
# Converting the label list to a NumPy array
labels = np.asarray(labels)
In [45]:
labels.shape
Out[45]:
In [46]:
# Shuffling the data, since the samples are ordered (all negative reviews first, then all positive)
indices = np.arange(data.shape[0])
np.random.shuffle(indices)
data = data[indices]
labels = labels[indices]
In [47]:
# Splitting the data into train and validation datasets
x_train = data[:training_samples]
y_train = labels[:training_samples]
x_val = data[training_samples: training_samples + validation_samples]
y_val = labels[training_samples: training_samples + validation_samples]
In [48]:
x_train.shape
Out[48]:
In [49]:
x_val.shape
Out[49]:
The GloVe embeddings can be downloaded from: http://nlp.stanford.edu/data/glove.6B.zip
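To fetch and unpack the embeddings programmatically, a minimal sketch such as the following could be used (the archive is several hundred megabytes; the target directory mirrors the glove_dir used below):
import os
import zipfile
import urllib.request

glove_dir = './data/Chapter 6.1.2 - Using word embeddings/glove.6B/'
glove_zip = './data/Chapter 6.1.2 - Using word embeddings/glove.6B.zip'
os.makedirs(glove_dir, exist_ok = True)
if not os.path.exists(glove_zip):
    urllib.request.urlretrieve('http://nlp.stanford.edu/data/glove.6B.zip', glove_zip)
with zipfile.ZipFile(glove_zip) as archive:
    # Extracts glove.6B.100d.txt (among others) into glove_dir
    archive.extractall(glove_dir)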
In [50]:
# Importing tqdm to show a progress bar
from tqdm import tqdm
In [51]:
glove_dir = './data/Chapter 6.1.2 - Using word embeddings/glove.6B/'
embeddings_index = {}
f = open(os.path.join(glove_dir, 'glove.6B.100d.txt'),
         encoding = 'utf-8')
for line in tqdm(f):
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:],
                       dtype = 'float32')
    embeddings_index[word] = coefs
f.close()
In [52]:
len(embeddings_index)
Out[52]:
In [53]:
embedding_dim = 100
embedding_matrix = np.zeros((max_words, embedding_dim))
for word, i in word_index.items():
    if i < max_words:
        embedding_vector = embeddings_index.get(word)
        # Words not found in the embedding index will be represented as zeros
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector
In [54]:
embedding_matrix
Out[54]:
In [55]:
from keras.models import Sequential
from keras.layers import Embedding, Flatten, Dense
In [56]:
model = Sequential()
model.add(Embedding(input_dim = max_words,
                    output_dim = embedding_dim,
                    input_length = maxlen))
model.add(Flatten())
model.add(Dense(units = 32,
                activation = 'relu'))
model.add(Dense(units = 1,
                activation = 'sigmoid'))
model.summary()
In [57]:
# Loading pretrained word embeddings
model.layers[0].set_weights([embedding_matrix])
# Freezing the layer
model.layers[0].trainable = False
In [58]:
model.compile(optimizer = 'rmsprop',
              loss = 'binary_crossentropy',
              metrics = ['acc'])
In [59]:
history = model.fit(x = x_train,
                    y = y_train,
                    epochs = 10,
                    batch_size = 32,
                    validation_data = (x_val, y_val))
In [60]:
model.save_weights('./saved_checkpoints/Chapter 6.1.2 - Using word embeddings/pre_trained_glove_model.h5')
In [61]:
import matplotlib.pyplot as plt
In [62]:
acc = history.history['acc']
val_acc = history.history['val_acc']
loss = history.history['loss']
val_loss = history.history['val_loss']
epochs = range(1, len(acc) + 1)
plt.plot(epochs, acc, 'bo', label='Training acc')
plt.plot(epochs, val_acc, 'b', label='Validation acc')
plt.title('Training and validation accuracy')
plt.legend()
plt.figure()
plt.plot(epochs, loss, 'bo', label='Training loss')
plt.plot(epochs, val_loss, 'b', label='Validation loss')
plt.title('Training and validation loss')
plt.legend()
plt.show()
The model overfits very quickly, which is unsurprising given that there are only 200 training samples. As a baseline, the same model is trained below without the pretrained GloVe embeddings.
In [63]:
from keras.models import Sequential
from keras.layers import Embedding, Flatten, Dense
model = Sequential()
model.add(Embedding(max_words, embedding_dim, input_length=maxlen))
model.add(Flatten())
model.add(Dense(32, activation='relu'))
model.add(Dense(1, activation='sigmoid'))
model.summary()
model.compile(optimizer='rmsprop',
              loss='binary_crossentropy',
              metrics=['acc'])
history = model.fit(x_train, y_train,
                    epochs=10,
                    batch_size=32,
                    validation_data=(x_val, y_val))
In [64]:
acc = history.history['acc']
val_acc = history.history['val_acc']
loss = history.history['loss']
val_loss = history.history['val_loss']
epochs = range(1, len(acc) + 1)
plt.plot(epochs, acc, 'bo', label='Training acc')
plt.plot(epochs, val_acc, 'b', label='Validation acc')
plt.title('Training and validation accuracy')
plt.legend()
plt.figure()
plt.plot(epochs, loss, 'bo', label='Training loss')
plt.plot(epochs, val_loss, 'b', label='Validation loss')
plt.title('Training and validation loss')
plt.legend()
plt.show()